suppressPackageStartupMessages(suppressWarnings({
library(tidyverse)
library(data.table)
library(knitr)
library(gridExtra)
}))
opts_chunk$set(cache=TRUE, echo=TRUE, message=FALSE, warning=FALSE)
load("../results/wc_sims.rdata")
What fraction of discovered SNPs will be significantly overestimated for a given minimum rsq value?
ggplot(l, aes(x=nid, y=nsig_overestimate / nsig)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_x_log10() +
ylim(c(0,1)) +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ .)
And what fraction will be weak?
p0 <- l %>%
mutate(h2 = paste0("h2 = ", h2), S=paste0("S = ", S)) %>%
ggplot(., aes(x=nid, y=nsig_overestimate / nsig)) +
geom_point(aes(colour=as.factor(nsnp)), size=0.4) +
geom_smooth(aes(colour=as.factor(nsnp)), se=FALSE) +
scale_colour_brewer(type="seq", palette=2) +
facet_grid(h2 ~ S) +
ylim(c(0,1)) +
labs(x="Sample size", y="Proportion of discovered effects overestimated", colour="Number of causal variants") +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5), legend.position="bottom")
p0
Relationship between overestimates and weak? As power gets better more of the overestimates are still strong
ggplot(l, aes(x=nsig_overestimate, y=nsig_weak)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_x_log10() +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ S)
How many just overestimated
ggplot(l, aes(x=nid, y=nsig_overestimate)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_x_log10() +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ S)
ggplot(l, aes(x=nid, y=nsig_overestimate/nsig)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_x_log10() +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ S)
ggplot(l, aes(x=nid, y=min_rsq)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ S)
ggplot(l, aes(x=nid, y=nsig)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ S)
ggplot(l, aes(x=nsig, y=nsig_overestimate/nsig)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ S)
load("../results/instrument_list.rdata")
counts <- fread("../results/count_tophits.txt")
instrument_counts <- subset(instrument_counts, !is.na(BETA.disc))
weak_p <- pf(10, 1, 100000, low=FALSE)
Quick summary
s1 <- instrument_counts %>%
group_by(id) %>%
summarise(
ndisc=sum(P_BOLT_LMM_INF.disc < 5e-8),
nrepl=sum(P_BOLT_LMM_INF.repl < 5e-8),
nhalf=mean(c(ndisc, nrepl)),
ntot=sum(P_BOLT_LMM_INF.disc < 5e-8 | P_BOLT_LMM_INF.repl < 5e-8),
ndisc_weak=sum(P_BOLT_LMM_INF.disc < 5e-8 & P_BOLT_LMM_INF.repl > weak_p),
nrepl_weak=sum(P_BOLT_LMM_INF.repl < 5e-8 & P_BOLT_LMM_INF.disc > weak_p),
ndisc_overestimated=sum(P_BOLT_LMM_INF.disc < 5e-8 & (abs(BETA.disc) - 1.96 * SE.disc) > abs(BETA.repl)),
nrepl_overestimated=sum(P_BOLT_LMM_INF.repl < 5e-8 & (abs(BETA.repl) - 1.96 * SE.repl) > abs(BETA.disc)),
nhalf_overestimated=mean(c(ndisc_overestimated, nrepl_overestimated)),
ndisc_underestimated=sum(P_BOLT_LMM_INF.disc < 5e-8 & (abs(BETA.disc) + 1.96 * SE.disc) < abs(BETA.repl)),
nrepl_underestimated=sum(P_BOLT_LMM_INF.repl < 5e-8 & (abs(BETA.repl) + 1.96 * SE.repl) < abs(BETA.disc)),
nhalf_underestimated=mean(c(ndisc_underestimated, nrepl_underestimated)),
nhalf_weak=mean(c(ndisc_weak, nrepl_weak))
)
Number of traits with an instrument
sum(s1$nhalf > 0)
## [1] 681
Total number of instruments
sum(s1$nhalf)
## [1] 13698
sum(s1$ndisc)
## [1] 13673
sum(s1$nrepl)
## [1] 13723
sum(s1$ntot)
## [1] 18482
how many instruments per trait
summary(s1$ndisc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 12.13 4.00 477.00
summary(s1$nrepl)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 12.18 3.00 460.00
summary(s1$ndisc[s1$ndisc != 0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 3.00 23.17 11.00 477.00
summary(s1$nrepl[s1$nrepl != 0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 3.00 23.54 11.00 460.00
Number of weak instruments
sum(s1$nhalf_weak)
## [1] 681
sum(s1$ndisc_weak)
## [1] 734
sum(s1$nrepl_weak)
## [1] 628
Number of overestimated instruments
sum(s1$nhalf_overestimated)
## [1] 2589
sum(s1$ndisc_overestimated)
## [1] 2623
sum(s1$nrepl_overestimated)
## [1] 2555
Overestimated instruments
summary(s1$nhalf_overestimated[s1$nhalf != 0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.500 0.500 3.802 2.500 60.000
summary(s1$ndisc_overestimated[s1$ndisc != 0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 4.446 3.000 56.000
summary(s1$nrepl_overestimated[s1$nrepl != 0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 4.383 4.000 64.000
summary(s1$nhalf_overestimated/s1$nhalf)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.1163 0.2302 0.3548 0.5000 1.0000 446
summary(s1$ndisc_overestimated/s1$ndisc)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.2000 0.3098 0.5000 1.0000 537
summary(s1$nrepl_overestimated/s1$nrepl)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.1875 0.2978 0.5000 1.0000 544
hist(s1$nhalf_overestimated / s1$nhalf, breaks=100)
hist(s1$nhalf_weak / s1$nhalf, breaks=100)
ggplot(s1, aes(x=ndisc, y=ndisc_overestimated / ndisc)) +
geom_point()
ggplot(s1, aes(x=ndisc, y=ndisc_weak / ndisc)) +
geom_point()
p1 <- bind_rows(
l %>% dplyr::select(nsig=nsig, nsnp=nsnp, nsig_overestimate) %>% mutate(what="Simulations"),
s1 %>% dplyr::select(nsig=ndisc, nsig_overestimate=ndisc_overestimated) %>% mutate(what="UK Biobank")
) %>%
{
ggplot(., aes(x=nsig, y=nsig_overestimate/nsig)) +
geom_point(data=subset(., what !="UK Biobank"), aes(colour=as.factor(nsnp)), size=0.1, alpha=0.5) +
geom_smooth(data=subset(., what !="UK Biobank"), aes(colour=as.factor(nsnp)), se=FALSE) +
geom_point(data=subset(., what =="UK Biobank")) +
scale_colour_brewer(type="seq", palette=2) +
# scale_x_log10() +
xlim(c(0,500)) +
labs(x="Discovered associations", y="Proportion overestimated", colour="Number of causal variants", shape="")
}
p1
s1 %>%
mutate(propover=ndisc_overestimated/ndisc, propweak=ndisc_weak/ndisc) %>%
dplyr::select(id, propover, propweak) %>%
pivot_longer(cols=c(propover, propweak)) %>%
ggplot(., aes(x=value)) +
geom_histogram(aes(fill=name), position="dodge", bins=100)
# geom_density(aes(fill=name), alpha=0.2)
ggplot(s1, aes(x=ndisc_overestimated/ndisc)) +
geom_histogram(bins=100)
g_legend<-function(a.gplot)
{
tmp <- ggplot_gtable(ggplot_build(a.gplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)
}
mylegend <- g_legend(p0 + theme(legend.position="bottom"))
p4 <- grid.arrange(
mylegend, arrangeGrob(p0 + theme(legend.position="none") + labs(title="A"),
p1 + theme(legend.position="none") + labs(title="B", y=""),
nrow=2),
nrow=2, heights=c(1, 10)
)